# -*- coding: utf-8 -*-

"""
Datetime: 2020/03/01
Author: Zhang Yafei
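Description: Helper utilities for the Sina Weibo spider: timestamp normalisation, HTML-to-text clean-up for posts and comments, and comment-page URL generation.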
"""
import requests
from pandas import DataFrame
import pymongo
import re
import datetime

from SinaWeiboSpider.settings import MONGO_URI, DB_NAME


def time_fix(time_string, now=None):
    """Convert a relative Weibo timestamp into an absolute date string.

    The following forms are recognised, checked in this order:

    * ``N分钟前`` (N minutes ago) -> ``YYYY-MM-DD HH:MM``
    * ``N小时前`` (N hours ago) -> ``YYYY-MM-DD HH:MM``
    * ``今天 ...`` (today) -> ``今天`` is replaced by the reference date
    * ``MM月DD日 ...`` -> the reference year is prefixed and ``月``/``日``
      are rewritten to ``-``/nothing

    Any other input is returned unchanged.

    Args:
        time_string: Timestamp text to normalise.
        now: Reference ``datetime.datetime`` that relative values are
            resolved against. Defaults to ``datetime.datetime.now()``.

    Returns:
        The normalised timestamp string, or ``time_string`` itself when no
        known form is found.
    """
    now_time = datetime.datetime.now() if now is None else now

    # Match the digits directly in front of the marker instead of anchoring
    # at the start of the string: text with leading whitespace, or a marker
    # with no number before it, no longer raises AttributeError on a None match.
    minutes_ago = re.search(r'(\d+)分钟前', time_string)
    if minutes_ago:
        created_at = now_time - datetime.timedelta(minutes=int(minutes_ago.group(1)))
        return created_at.strftime('%Y-%m-%d %H:%M')

    hours_ago = re.search(r'(\d+)小时前', time_string)
    if hours_ago:
        created_at = now_time - datetime.timedelta(hours=int(hours_ago.group(1)))
        return created_at.strftime('%Y-%m-%d %H:%M')

    if '今天' in time_string:
        return time_string.replace('今天', now_time.strftime('%Y-%m-%d'))

    if '月' in time_string:
        # "02月28日 12:30" -> "02-28 12:30", then prefix the reference year.
        month_day = time_string.replace('月', '-').replace('日', '')
        return f'{now_time.year}-{month_day}'

    return time_string


# Pre-compiled patterns used by the extract_* helpers below to strip markup.
# All patterns are raw strings so regex escapes such as `\.` are not parsed
# as (invalid) Python string escapes.

# Keyword-highlight span tags, the "原图" (original image) label and the
# translation placeholder comment. The pattern has no trailing `|`: an empty
# alternative would match at every position without changing the result.
keyword_re = re.compile(r'<span class="kt">|</span>|原图|<!-- 是否进行翻译 -->')
# The two halves of an <img> tag served from h5.sinaimg; removing both
# leaves only the tag's alt text behind.
emoji_re = re.compile(r'<img alt="|" src="//h5\.sinaimg(.*?)/>')
# Line breaks (callers substitute a single space).
white_space_re = re.compile(r'<br />')
# Opening and closing <div> tags.
div_re = re.compile(r'</div>|<div>')
# Any remaining <img ... /> tag.
image_re = re.compile(r'<img(.*?)/>')
# Anchor open/close tags; the link text between them is kept.
url_re = re.compile(r'<a href=(.*?)>|</a>')


def extract_weibo_content(weibo_html):
    """Reduce the HTML of a single weibo post to its plain-text content.

    Steps, in order:

    1. Keep only what follows ``转发理由:`` (repost reason) if present.
    2. Keep only what follows ``class="ctt">`` if present.
    3. Cut everything from the first ``赞`` (like) onwards.
    4. Strip keyword spans, emoji/image tags, anchors and divs.
    5. Cut everything from the first ``<span class="ct">`` onwards.
    6. Turn ``<br />`` into spaces, drop non-breaking spaces, and trim
       surrounding colons and whitespace.

    Args:
        weibo_html: HTML fragment of one post.

    Returns:
        The cleaned text.
    """
    s = weibo_html

    # partition() keeps the test and the cut on the same delimiter. The
    # previous code tested for '转发理由' but split on '转发理由:', so text
    # containing the words without the ASCII colon raised IndexError.
    _, marker, tail = s.partition('转发理由:')
    if marker:
        s = tail

    _, marker, tail = s.partition('class="ctt">')
    if marker:
        s = tail

    # NOTE(review): this also truncates posts whose own text contains "赞";
    # kept as-is because the surrounding page layout is not visible here.
    s = s.split('赞', maxsplit=1)[0]

    s = keyword_re.sub('', s)
    s = emoji_re.sub('', s)
    s = url_re.sub('', s)
    s = div_re.sub('', s)
    s = image_re.sub('', s)

    # keyword_re has already removed every '</span>', so only the opening
    # tag is left to cut on.
    s = s.partition('<span class="ct">')[0]

    s = white_space_re.sub(' ', s)
    s = s.replace('\xa0', '')
    s = s.strip(':')
    s = s.strip()
    return s


def extract_comment_content(comment_html):
    """Reduce the HTML of a single comment to its plain-text content.

    The text after ``class="ctt">`` (when present) and before the first
    ``举报`` (report) is kept. Emoji, keyword, anchor, div and image markup
    is then removed, ``<br />`` becomes a space, non-breaking spaces are
    dropped, and surrounding colons and whitespace are trimmed.

    Args:
        comment_html: HTML fragment of one comment.

    Returns:
        The cleaned comment text.
    """
    _, found, after_ctt = comment_html.partition('class="ctt">')
    text = after_ctt if found else comment_html
    text = text.partition('举报')[0]

    # Applied in this exact order; each entry is (pattern, replacement).
    substitutions = (
        (emoji_re, ''),
        (keyword_re, ''),
        (url_re, ''),
        (div_re, ''),
        (image_re, ''),
        (white_space_re, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    return text.replace('\xa0', '').strip(':').strip()


def get_fangfang_urls():
    """Build the first comment-page URL for every post in the ``weibo`` collection.

    Reads the ``weibo_url`` field of each document in ``DB_NAME.weibo``,
    takes the last ``/``-separated segment as the post id and formats it
    into ``https://weibo.cn/comment/<id>?ckAll=1&page=1``.

    Returns:
        list[str]: One URL per stored document, in cursor order.
    """
    # The context manager closes the client (and its connections) even if
    # reading the cursor fails; previously the client was never closed.
    with pymongo.MongoClient(MONGO_URI) as client:
        cursor = client[DB_NAME]['weibo'].find({}, {'_id': 0, 'weibo_url': 1})
        # Consume the lazy cursor while the client is still open.
        weibo_ids = [doc['weibo_url'].rsplit('/')[-1] for doc in cursor]

    return [
        f'https://weibo.cn/comment/{weibo_id}?ckAll=1&page=1'
        for weibo_id in weibo_ids
    ]


def get_comment_url():
    """Write the comment-page URLs of every stored post to ``fangfang_weibo_urls.txt``.

    For each URL from ``get_fangfang_urls()`` the first comment page is
    fetched and its total page count read from the ``1/N页`` marker. One
    line per page URL is written, for at most the first 50 pages. When the
    marker is missing, a single ``<index>\\t<url>`` line is written instead.
    Progress is printed after each post.
    """
    max_pages = 50  # at most this many comment pages are listed per post

    # NOTE(review): a session cookie is hard-coded here; consider loading it
    # from configuration so credentials do not live in source control.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'Cookie': '_T_WM=98425059843; SUB=_2A25zfKIHDeRhGeNK71AV8y3Mwz2IHXVQns5PrDV6PUJbktANLWfSkW1NSX0x7yGmH3rvwWO4cufwvLcT1Acn8ALI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhJ4ghp97xQ9zYjbSLg-nod5JpX5KzhUgL.Fo-XShzXe0e71h22dJLoIp7LxK-LB.eLBK5LxKBLB.2L1-2LxK.LBKeL1--0; SUHB=0tKqkT5m9tYz4q'}

    # `with` guarantees the file is flushed and closed even when a request
    # raises part-way through the crawl.
    with open('fangfang_weibo_urls.txt', 'w', encoding='utf-8') as f:
        for num, url in enumerate(get_fangfang_urls()):
            # NOTE(review): no timeout is set, so a stalled connection blocks
            # indefinitely; consider passing `timeout=` here.
            response = requests.get(url=url, headers=headers)
            all_page = re.search(r'1/(\d+)页', response.text)
            if all_page:
                page_count = min(int(all_page.group(1)), max_pages)
                for page_num in range(1, page_count + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    f.write(f'{page_url}\n')
            else:
                # No pager found: record the index and URL on one line.
                f.write(f'{num}\t{url}\n')
            print(f'{url}下载完成')


if __name__ == '__main__':
    # Export every document of the `comment` collection to an Excel sheet.
    # get_comment_url()
    mongo_client = pymongo.MongoClient(MONGO_URI)
    comment_cursor = mongo_client[DB_NAME]['comment'].find()
    comments = DataFrame(data=comment_cursor)

    # Keep the paged URL under a new name, then derive the post-level URL
    # as everything before the "&page" parameter.
    comments = comments.rename(columns={'weibo_url': 'weibo_url_page'})
    comments['weibo_url'] = comments['weibo_url_page'].str.extract('(.*?)&page')

    comments.to_excel('评论.xlsx', index=False)